home *** CD-ROM | disk | FTP | other *** search
- #!/usr/local/bin/perl
-
- # This package uses the global file handle htmlFile
- # There are two global assoc. arrays, endTags & handlerDict
-
- # parseHtml takes one argument, a filename
- # and returns the parsed html in a string
-
- sub parseHtml
- {
-     # Parse the HTML file named by the argument and return the filtered
-     # text produced by mainHtmlParser.
-     #
-     # Arguments: $fileName - name of the file to read. It is handed to
-     #            the two-argument form of open, so it must come from a
-     #            trusted source.
-     # Returns:   the parsed string, or undef when the file cannot be
-     #            opened.
-     #
-     # The global file handle htmlFile is used because mainHtmlParser
-     # reads from it directly.
-     local($fileName) = @_;
-     local($retVal);
-
-     # Test the result of open itself. The bareword htmlFile used as a
-     # condition is only the (always true) string "htmlFile", so testing
-     # it would run the parser even after a failed open.
-     if(open(htmlFile,$fileName))
-     {
-         # Parse the whole file: no stop string and no stop character
-         $retVal = &mainHtmlParser("",0);
-
-         # Only close what was actually opened
-         close(htmlFile);
-     }
-
-     # Return the string parsed from the file
-     return $retVal;
- }
-
- # mainHtmlParser takes several arguments
- # This subroutine can either take a stop string, or a stop char
- # it reads the file htmlFile until either the end of file
- # the stopstring or the stop char is encountered.
- #
- # mainHtmlParser returns a string filtered from the file.
- # The filters are tag handlers and a default handler.
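- # Tag handlers are called with the following arguments:
- #
- # tagString - The string containing the tag
- # argString - Any data between the tag and end tag (0 if there is none)
- # endString - The end tag (0 if there is none)
- # tagDict - The dictionary created using dictForTag, passed as a
- # flattened list of key/value pairs
- # userData - The user data argument (handleTag does not currently
- # pass one)
- #
- # The DEFAULT handler receives the plain text as its first argument
- # and 0 for the remaining three.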
- #
- # Handlers are registered in the global dictionary
- # handlerDict.
- #
- # If the tag has a matching end tag like <HTML> and </HTML>
- # then the tag should be registered in the global
- # %endTags array, with the value equal to its end tag.
- #
- # If the tag needs the data up to the end of the line, like
- # OPTION, then it should appear in %endTags with the value
- # "eol".
- #
- # Handlers should return the string to replace the tag with.
- #
- # The default is used for text that wasn't part of a tag.
- # Tags are denoted by <text>.
- # As plain text is encountered the handler registered under
- # the string "DEFAULT" is called.
-
- sub mainHtmlParser
- {
-     # Read the global file handle htmlFile one character at a time and
-     # return the filtered text.
-     #
-     # Arguments: $stopStr  - when true, reading stops at the first tag
-     #                        that matches it (used as a case-insensitive
-     #                        pattern); that tag is not part of the result.
-     #            $stopChar - when true, reading stops at this character;
-     #                        the character itself is kept in the result.
-     # Returns:   everything read, with plain text passed through
-     #            handlePlainText and tags passed through handleTag.
-     # Dies:      on "<" inside a tag, on ">" outside a tag, or when the
-     #            stop character / end of file is seen inside a tag.
-     local($stopStr,$stopChar) = @_;
-
-     # $tmpBuffer collects the current run of text or the current tag,
-     # $mainBuffer collects what is returned.
-     local($char,$inTag,$tmpBuffer,$mainBuffer);
-
-     $mainBuffer = "";
-
-     # $inTag is true while we are between "<" and ">"
-     $inTag = 0;
-
-     do
-     {
-         # Reading with getc is not the most efficient way to read a
-         # file, but it keeps the state machine simple.
-         $char = getc(htmlFile);
-
-         if($char eq "<")
-         {
-             # Tags may not be nested inside other tags
-             die "This is an invalid html file.\n" if $inTag;
-
-             $inTag = 1;
-
-             # Flush the plain text collected so far. Compare against
-             # the empty string so that a text run of "0" is not lost.
-             if($tmpBuffer ne "")
-             {
-                 $mainBuffer .= &handlePlainText($tmpBuffer);
-             }
-
-             # Start collecting the tag
-             $tmpBuffer = "<";
-         }
-         elsif($char eq ">")
-         {
-             # A tag cannot end before it has started
-             die "This is an invalid html file.\n" if !$inTag;
-
-             $inTag = 0;
-             $tmpBuffer .= ">";
-
-             # Reaching the stop string ends this call; the stop tag
-             # itself is dropped.
-             if($stopStr && ($tmpBuffer =~ /$stopStr/i))
-             {
-                 return $mainBuffer;
-             }
-
-             # Otherwise filter the tag and keep reading
-             $mainBuffer .= &handleTag($tmpBuffer);
-             $tmpBuffer = "";
-         }
-         elsif(eof(htmlFile)
-               || ($stopChar && ($char eq $stopChar)))
-         {
-             # Parsing may not end inside a tag
-             die "This is an invalid html file.\n" if $inTag;
-
-             # Keep the character whenever one was actually read. eof is
-             # already true right after the last character of the file
-             # has been read, so testing eof here would drop that
-             # character.
-             $tmpBuffer .= $char if (defined($char) && $char ne "");
-
-             # Filter the remaining plain text and finish
-             $mainBuffer .= &handlePlainText($tmpBuffer);
-
-             return $mainBuffer;
-         }
-         else
-         {
-             # Ordinary character: part of the current text run or tag
-             $tmpBuffer .= $char;
-         }
-     }
-     until(eof(htmlFile));
-
-     # Reached when the file ends directly after a tag
-     return $mainBuffer;
- }
-
- #
- # handleTag actually handles the tags for mainHtmlParser
-
- sub handleTag
- {
-     # Filter a single tag on behalf of mainHtmlParser.
-     #
-     # Arguments: $tagString - the complete tag, including "<" and ">".
-     # Returns:   the replacement text. Without a registered handler the
-     #            tag is treated as plain text; otherwise this is the
-     #            return value of the handler named in %handlerDict.
-     #
-     # Handlers are called by name through a string eval, so a failing
-     # handler does not abort the parse; the failure is reported with
-     # warn and the tag is replaced by the (undefined) eval result.
-     local($tagString) = @_;
-     local(%tagDict,$endTag,$handler,$argString);
-     local($evalString);
-
-     # Break the tag into its name and attributes
-     %tagDict = &dictForTag($tagString);
-
-     # Look up the end tag (or "eol") and the handler registered for
-     # this tag name in the global %endTags and %handlerDict arrays.
-     $endTag = $endTags{$tagDict{"TAG"}};
-     $handler = $handlerDict{$tagDict{"TAG"}};
-
-     # No handler: the tag is just plain text
-     if(!($handler))
-     {
-         return &handlePlainText($tagString);
-     }
-
-     if($endTag eq "eol") # Tag that needs data to eol
-     {
-         # Read (and filter) the rest of the line
-         $argString = &mainHtmlParser("","\n");
-
-         $evalString = "&".$handler.'($tagString,$argString,0,%tagDict);';
-     }
-     elsif($endTag) # Tag with an end tag
-     {
-         # Read (and filter) everything up to the end tag.
-         # mainHtmlParser drops the end tag itself, so nothing has to be
-         # removed from the result; stripping a trailing "<...>" here
-         # would delete nested content such as "<B>text</B>".
-         $argString = &mainHtmlParser($endTag,0);
-
-         $evalString = "&".$handler.'($tagString,$argString,$endTag,%tagDict);';
-     }
-     else # General unary tag
-     {
-         $evalString = "&".$handler.'($tagString,0,0,%tagDict);';
-     }
-
-     # Call the handler
-     $tagString = eval($evalString);
-
-     # Make a failing handler visible instead of silently losing the tag
-     warn "handleTag: handler ".$handler." failed: ".$@ if $@;
-
-     # Return the parsed text.
-     return $tagString;
- }
-
- # handlePlainText actually handles plain text for mainHtmlParser
-
- sub handlePlainText
- {
-     # Filter a run of plain text on behalf of mainHtmlParser.
-     #
-     # Arguments: $plainString - the text to filter.
-     # Returns:   the text unchanged when no handler is registered under
-     #            "DEFAULT" in %handlerDict, otherwise the return value
-     #            of that handler.
-     #
-     # The handler is called by name through a string eval. If it fails,
-     # the failure is reported with warn and the (undefined) eval result
-     # is returned.
-     local($plainString) = @_;
-     local($handler,$evalString);
-
-     # Look for a default handler for plain text
-     $handler = $handlerDict{"DEFAULT"};
-
-     # Without a handler the text passes through untouched
-     return $plainString if !$handler;
-
-     # The handler gets the text and 0 for the three unused arguments
-     $evalString = "&".$handler.'($plainString,0,0,0);';
-     $plainString = eval($evalString);
-
-     # Make a failing handler visible instead of silently losing text
-     warn "handlePlainText: handler ".$handler." failed: ".$@ if $@;
-
-     return $plainString;
- }
-
- # Creates an associative array for a tag string
-
- sub dictForTag
- {
-     # Build an associative array describing a tag string.
-     #
-     # Arguments: $tagString - a tag such as '<IMG SRC="a.gif" ISMAP>'.
-     # Returns:   a list of key/value pairs:
-     #              TAG           => upper-cased tag name
-     #              NAME => value    for each NAME=value attribute
-     #              NAME => NAME     for each attribute without a value
-     #            Attribute names are upper-cased, values are kept as
-     #            written. The list is empty when no tag name is found
-     #            (for example for end tags such as "</A>").
-     local($tagString) = @_;
-     local(%tagDict,$key,$value);
-
-     # Find the tag name, either "<NAME" or "<!--NAME", and remove it.
-     # The delimiter after the name is put back so that an attribute
-     # directly following the name is still preceded by whitespace and
-     # is found by the single-attribute loop below.
-     if(($tagString =~ s/^<(\w*)([\s>])/$2/) && $1)
-     {
-         ($key = $1) =~ tr/a-z/A-Z/; # Make the tag upper case
-
-         $tagDict{"TAG"} = $key;
-     }
-     elsif(($tagString =~ s/^<!--(\w*)([\s>])/$2/) && $1)
-     {
-         ($key = $1) =~ tr/a-z/A-Z/; # Make the tag upper case
-
-         $tagDict{"TAG"} = $key;
-     }
-     else
-     {
-         # Not a tag string
-         return %tagDict;
-     }
-
-     # Find all of the tag's key/value attributes, left to right, and
-     # remove them from the tag string. The value may be written in
-     # double quotes, in single quotes, or without quotes.
-     while($tagString =~ s/(\w*)\s*=\s*(\"[^\"]*\"|'[^']*'|[^\s>\"']+)//)
-     {
-         ($key,$value) = ($1,$2);
-
-         # Strip the surrounding quotes, if there are any
-         if($value =~ /^[\"']/)
-         {
-             $value = substr($value,1,length($value) - 2);
-         }
-
-         if($key ne "")
-         {
-             $key =~ tr/a-z/A-Z/; # Make upper case
-
-             # Store the value as written; "0" and "" are valid values
-             $tagDict{$key} = $value;
-         }
-     }
-
-     # Find the single attributes
-     # and remove them from the string.
-     while($tagString =~ s/\s+(\w*)[\s>]*//)
-     {
-         $key = $1;
-
-         if($key ne "")
-         {
-             $key =~ tr/a-z/A-Z/; # Make upper case
-             $tagDict{$key} = $key; # Add to the dict
-         }
-     }
-
-     return %tagDict;
- }
-
- # Creates a string from a tag dictionary
-
- sub stringForTagDict
- {
-     # Build a tag string from a tag dictionary (the inverse of
-     # dictForTag).
-     #
-     # Arguments: %tagDict - key/value pairs; the entry under "TAG" is
-     #            the tag name, every other entry is an attribute.
-     # Returns:   the tag string, e.g. '<IMG ISMAP SRC= "a.gif">', or
-     #            undef when the dictionary is empty or has no TAG.
-     #
-     # Attributes are written in sorted key order so that the result is
-     # the same on every run. An attribute whose value equals its name
-     # is written without a value; an attribute with an empty value is
-     # left out.
-     local(%tagDict) = @_;
-     local($tagString,$key,$value,$quote);
-
-     # Nothing to build without a tag name
-     return $tagString unless $tagDict{"TAG"};
-
-     # Start the string with a < and the tag
-     $tagString = "<" . $tagDict{"TAG"};
-
-     foreach $key (sort keys %tagDict)
-     {
-         # Ignore TAG, we already added it
-         next if $key eq "TAG";
-
-         $value = $tagDict{$key};
-
-         if($key eq $value) # unary attribute
-         {
-             $tagString .= " " . $key;
-         }
-         elsif($value ne "") # key/value attribute; "0" is a real value
-         {
-             # Use single quotes when the value itself contains a
-             # double quote (and no single quote)
-             $quote = "\"";
-             $quote = "'" if ($value =~ /\"/ && $value !~ /'/);
-
-             $tagString .= " " . $key . "= " . $quote . $value . $quote;
-         }
-     }
-
-     # Close the tag string
-     $tagString .= ">";
-
-     return $tagString;
- }
-
- 1;
-
-
-
-
-